This section and the next are relevant for reproducibility purposes. For results, please skip to section 3 (Quality Control)
suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(emoji)
library(gplots)
library(gtools)
library(here)
library(hyperSpec)
library(limma)
library(magrittr)
library(parallel)
library(patchwork)
library(PCAtools)
library(pheatmap)
library(plotly)
library(pvclust)
library(RColorBrewer)
library(tidyverse)
library(tximport)
library(vsn)
})filelist <- list.files(here("data/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)Sort the raw data and samples
samples <- samples[match(basename(dirname(filelist)),samples$SampleID),]
stopifnot(all(samples$SampleID == basename(dirname(filelist))))Read the expression at the transcript level (we have no gene information)
Read the algae IDs
Subset the data
txi[!names(txi) == "countsFromAbundance"] <- lapply(txi[!names(txi) == "countsFromAbundance"],function(l){l[IDs,]})
stopifnot(all(sapply(lapply(lapply(txi[!names(txi) == "countsFromAbundance"],rownames),"==",IDs),all)))
counts <- txi$counts
colnames(counts) <- samples$SampleIDsel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))## [1] "4.5% percent (2233) of 49477 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples)
levels = c("std", "60min", "4hrs", "12hrs", "24hrs", "72hrs", "120hrs")
ggplot(dat,aes(x,y,fill=factor(Time, levels))) +
geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90,size=10),
axis.title.x=element_blank()) +
labs(fill = "Time")👉 We observe almost no difference in the raw sequencing depth
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density(na.rm = TRUE) +
ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")👉 The cumulative gene coverage is as expected
The same is done for the individual samples colored by Time
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Time=samples$Time[match(ind,samples$SampleID)])
dat$Time <- factor(dat$Time, levels)
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density(na.rm = TRUE) +
ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")👉 All samples have the same sequencing depth characteristics and there is no deviation when we look at the Time variable
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
dds <- DESeqDataSetFromTximport(
txi=txi,
colData = samples,
design = ~ Time) %>%
suppressMessages()
save(dds,file=here("data/analysis/salmon/dds-sample-swap-corrected.rda"))
colnames(dds) <- paste(samples$Time,samples$Replicate,sep="_")Check the size factors (i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds) %>%
suppressMessages()
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y")
abline(h=1, col = "Red", lty = 3)and without outliers:
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y", outline=FALSE)
abline(h=1, col = "Red", lty = 3)👉 There is almost no differences in the libraries’ size factors. They are all within +/- 30% of the average library size.
Assess whether there might be a difference in library size linked to a given metadata
boxplot(split(t(normalizationFactors(dds)),dds$Time),las=2,
main="Sequencing libraries size factor by Time",
outline=FALSE)👉 The scaling factor distribution is not independent from the Time variable.
plot(colMeans(normalizationFactors(dds)),
log10(colSums(counts(dds))),ylab="log10 raw depth",
xlab="average scaling factor",
col=rainbow(n=nlevels(dds$Time))[as.integer(dds$Time)],pch=19)
legend("bottomright",fill=rainbow(n=nlevels(dds$Time)),
legend=levels(dds$Time),cex=0.6)👉 The scaling factor appear linearly proportional to the sequencing depth for all samples but Time 4hrs. This might indicate that the number of genes expressed at 4hrs differs from the other samples.
vsd <- varianceStabilizingTransformation(dds, blind=FALSE) %>% suppressMessages()
vst <- assay(vsd)
vst <- vst - min(vst)let’s look at standard deviations before and after VST normalization. This plot is to see whether there is a dependency of SD on the mean.
Before:
After:
After VST normalization, the red line is almost horizontal which indicates no dependency of variance on mean (homoscedastic).
👉 We can conclude that the variance stabilization worked adequately
Using PCAtools
We define the number of variable of the model
An the number of possible combinations
We devise the optimal number of components using two methods
We plot the percentage explained by different components and try to empirically assess whether the observed number of components would be in agreement with our model’s assumptions.
ggplot(tibble(x=1:length(percent),y=cumsum(percent),p=percent),aes(x=x,y=y)) +
geom_line() + geom_col(aes(x,p)) + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component",breaks=1:length(percent),minor_breaks=NULL) +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",linewidth=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",linewidth=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",linewidth=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",linewidth=0.5) +
geom_vline(xintercept=c(horn$n,elbow),colour="black",linetype="dotted",linewidth=0.5) +
geom_label(aes(x = horn$n + 1, y = cumsum(percent)[horn$n],label = 'Horn', vjust = 1)) +
geom_label(aes(x = elbow + 1, y = cumsum(percent)[elbow],label = 'Elbow', vjust = 1))👉 The first component explains 27% of the data variance. Both metrics, Horn and Elbow suggest that five or six components are those that are informative. Indeed the slope of the curve is fairly linear past PC7 and that would indicate that the remaining PCs only capture sample specific noise. While this is only empirical, the scree plot support having only few variables of importance in the dataset.
p1 <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,text=SampleID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p1) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep=""))) %>% suppressWarnings()The same as a biplot
biplot(p,
colby = "Time",
colLegendTitle = "Time",
lab=samples$Replicate,
encircle = TRUE,
encircleFill = TRUE,
legendPosition = 'top',
legendLabSize = 16, legendIconSize = 8.0)## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
👉 The first dimension separates the control and later samples, from the very early (1h centered) and early ones (4-12h).
p2 <- ggplot(pc.dat,aes(x=PC1,y=PC3,col=Time,text=SampleID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p2) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC3 (",percent[3],"%)",sep=""))) %>% suppressWarnings()The same as a biplot
biplot(p,x = 'PC1', y = 'PC3',
colby = 'Time',
colLegendTitle = 'Time',
lab=samples$Replicate,
encircle = TRUE,
encircleFill = TRUE,
legendPosition = 'top',
legendLabSize = 16, legendIconSize = 8.0)subplot(style(p1, showlegend = F), p2,
titleX = TRUE, titleY = TRUE, nrows = 1, margin = c(0.05, 0.05, 0, 0))👉 The third dimension separates the very late samples (72-120h) from the 24h ones
This allows for looking at more dimensions, five by default
👉 All the first five PCs can be explained by an effect of the Time variable
Loadings, i.e. the individual effect of every gene in a component can be studied. Here the most important ones are visualized throughout the different PCs
plotloadings(p,
rangeRetain = 0.01,
labSize = 4.0,
title = 'Loadings plot',
subtitle = 'PC1 to PC5',
caption = 'Top 1% variables',
drawConnectors = TRUE)## -- variables retained:
## TRINITY_DN1311_c0_g1_i2, TRINITY_DN574_c0_g1_i5, TRINITY_DN6996_c1_g1_i3, TRINITY_DN509_c1_g1_i10, TRINITY_DN22674_c8_g1_i5, TRINITY_DN3892_c0_g1_i10, TRINITY_DN6482_c0_g2_i12, TRINITY_DN3566_c0_g1_i9, TRINITY_DN3833_c0_g1_i12, TRINITY_DN2727_c0_g1_i1, TRINITY_DN1240_c0_g2_i2
## Warning: ggrepel: 23 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
👉 It could be interesting to lookup the annotations of some of these genes.
This is a plot showing the correlation between the PC and the model variables. Note that while this might be relevant for a linear variable, it is less so for categorical variables. Sorting categorical variables in a linear order according to the PCs above might help.
p$metadata$Minutes <- as.integer(sub("std",0,sub("60min",1,sub("hrs","",samples$Time))))
p$metadata$Response <- factor(levels=c("control","acute","early","late"),
sub(".*hrs","late",
gsub("12hrs|^4hrs","early",
sub("60min","acute",
sub("std","control",samples$Time)))))
suppressWarnings(eigencorplot(p,metavars=c('Minutes','Response')))👉 The first three components are associated with either variables.
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- colnames(sampleDistMatrix) <- dds$SampleID
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=pal,labels_row=samples$Time,
labels_col=samples$Time)👉 4h and 12h are the most distant time points, with 4h being the furthest away. 72 and 120 cannot be separated. 1h and 24h are close to the control, in that order.
The figures show the number of genes expressed per condition at different expression cutoffs. The scale on the lower plot is the same as on the upper. The first plot is a heatmap showing the number of genes above a given cutoff. The second plot shows it as a ratio of the number of genes expressed for (a) given variable(s) divided by the average number of genes expressed at that cutoff across all variable(s). The latter plot is of course biased at higher cutoff as the number of genes becomes smaller and smaller. The point of these two plots is to assert whether the number of genes expressed varies between conditions, as this would break some assumptions for normalisation and differential expression.
👉 As suggested by the initial QC, 4hrs is somewhat of an outlier in the numbers of genes expressed.
Plotting the number of genes that are expressed (at any level)
do.call(rbind,split(t(nrow(vst) - colSums(vst==0)),samples$Time)) %>% as.data.frame() %>%
rownames_to_column("Time") %>% pivot_longer(starts_with("V")) %>%
ggplot(aes(x=Time, y=value,fill=Time)) + geom_dotplot(binaxis = "y", stackdir = "center") +
scale_y_continuous("# of expressed genes")## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
👉 There is a clear effect on both 4h and 12h. Whether that explains part of PC1 is to be kept in mind.
Here we want to visualise all the informative genes as a heatmap. We first filter the genes to remove those below the selected noise/signal cutoff. The method employed here is naive, and relies on observing a sharp decrease in the number of genes within the first few low level of expression. Using an independent filtering method, such as implemented in DESeq2 would be more accurate, but for the purpose of QA validation, a naive approach is sufficient. Note that a sweet spot for computation is around 20 thousand genes, as building the hierarchical clustering for the heatmap scales non-linearly.
👉 Here a cutoff of 1 is selected
vst.cutoff <- 1
nn <- nrow(vst[sels[[vst.cutoff+1]],])
tn <- nrow(vst)
pn <- round(nn * 100/ tn, digits=1)⚠️ 48.8% (24142) of total 49477 genes are plotted below:
annotation column
colnames(mat) <- gsub("B2_2_","",colnames(mat))
col_anno <- samples %>% select(Time) %>% as.data.frame()
col_anno$Time <- factor(col_anno$Time, levels)
rownames(col_anno) <- colnames(mat)annotation colors
col_anno_col = brewer.pal(nlevels(conds),"Dark2")
names(col_anno_col) <- levels
col_anno_col=list(Time = col_anno_col)
hm <- pheatmap(mat,
color = hpal,
border_color = NA,
clustering_distance_cols = "correlation",
clustering_distance_rows = "correlation",
clustering_method = "ward.D2",
annotation_col = col_anno,
show_rownames = FALSE,
labels_col = conds,
angle_col = 90,
annotation_colors = col_anno_col,
legend = FALSE)👉 The heatmap shows a similar pattern to the PCA. Even if 4h could be biased as it has 10% less expressed genes, the changes observed are 4h are too drastic to be due to that alone.
Below we assess the previous dendrogram’s reproducibility and plot the clusters with au and bp where:
⚠️Clusters with AU larger than 95% are highlighted by rectangles, which are strongly supported by data
hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
method.hclust = "ward.D2",
nboot = 100, parallel = TRUE)## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.
##
## Cluster method: ward.D2
## Distance : correlation
##
## Estimates on edges:
##
## si au bp se.si se.au se.bp v c pchi
## 1 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 2 0.953 0.976 0.980 0.056 0.033 0.006 -2.015 -0.044 0.254
## 3 0.919 0.955 0.976 0.096 0.062 0.008 -1.836 -0.140 0.293
## 4 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 5 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 6 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 7 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 8 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 9 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 10 0.813 0.874 0.980 0.144 0.114 0.006 -1.598 -0.451 0.641
## 11 0.983 0.993 0.977 0.022 0.010 0.007 -2.233 0.236 0.987
## 12 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 13 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 14 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 15 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 16 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 17 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 18 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 19 0.943 0.972 0.970 0.049 0.029 0.006 -1.897 0.011 0.502
## 20 0.963 0.985 0.957 0.032 0.016 0.007 -1.941 0.224 0.449
## 21 0.964 0.986 0.956 0.031 0.015 0.008 -1.947 0.237 0.410
## 22 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 23 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 24 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 25 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 26 0.982 0.992 0.988 0.034 0.019 0.006 -2.324 0.067 0.872
## 27 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
⭐ The data looks very good after fixing the earlier samples swap.
⭐ Both PCA and heatmap show separation between different times except for 72 and 120 hours which are mixed together.
⭐ The 4h samples and to a lesser degree the 12h samples have up to 10% less genes expressed than the other samples. This could introduce a bias in the expression quantification, albeit the changes observed in the heatmap suggests that this effect has a technical origin rather than a biological origin: due to the higher expression levels at 4h, the sequencing depth would be shallower and lowly expressed genes would be drop-outs (while the gene is expressed in the sample, it is not being recorded)
⭐ In PCA plots, the first component explains the majority of variation (27%) and separates samples at 1,4 and 12 hours from the rest (std, 24,72 and 120 hrs). The second component explains further variation (16%) separating early stages (std, 1 and 4 hours) from later stages.
⭐ Together the PCA and heatmap plots suggest that 4 and 12hrs conditions have the strongest effect. This is less pronounced at 60 min but still there is nonetheless a clear effect. While 72hr and 120hr clustered separately from other time points, they do not show much difference between themselves.
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel grid stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.68.0 tximport_1.28.0
## [3] lubridate_1.9.2 forcats_1.0.0
## [5] stringr_1.5.0 dplyr_1.1.3
## [7] purrr_1.0.2 readr_2.1.4
## [9] tidyr_1.3.0 tibble_3.2.1
## [11] tidyverse_2.0.0 RColorBrewer_1.1-3
## [13] pvclust_2.2-0 plotly_4.10.2
## [15] pheatmap_1.0.12 PCAtools_2.12.0
## [17] ggrepel_0.9.3 patchwork_1.1.3
## [19] magrittr_2.0.3 limma_3.56.2
## [21] hyperSpec_0.100.0 xml2_1.3.5
## [23] ggplot2_3.4.3 lattice_0.21-8
## [25] here_1.0.1 gtools_3.9.4
## [27] gplots_3.1.3 emoji_15.0
## [29] DESeq2_1.40.2 SummarizedExperiment_1.30.2
## [31] Biobase_2.60.0 MatrixGenerics_1.12.3
## [33] matrixStats_1.0.0 GenomicRanges_1.52.0
## [35] GenomeInfoDb_1.36.3 IRanges_2.34.1
## [37] S4Vectors_0.38.1 BiocGenerics_0.46.0
## [39] data.table_1.14.8
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.15.0 jsonlite_1.8.7
## [3] farver_2.1.1 rmarkdown_2.24
## [5] zlibbioc_1.46.0 vctrs_0.6.3
## [7] DelayedMatrixStats_1.22.6 RCurl_1.98-1.12
## [9] htmltools_0.5.6 S4Arrays_1.0.6
## [11] proj4_1.0-13 sass_0.4.7
## [13] KernSmooth_2.23-22 bslib_0.5.1
## [15] htmlwidgets_1.6.2 plyr_1.8.8
## [17] testthat_3.1.10 cachem_1.0.8
## [19] ggalt_0.4.0 lifecycle_1.0.3
## [21] pkgconfig_2.0.3 rsvd_1.0.5
## [23] Matrix_1.6-0 R6_2.5.1
## [25] fastmap_1.1.1 GenomeInfoDbData_1.2.10
## [27] digest_0.6.33 colorspace_2.1-0
## [29] rprojroot_2.0.3 dqrng_0.3.1
## [31] irlba_2.3.5.1 crosstalk_1.2.0
## [33] beachmat_2.16.0 labeling_0.4.3
## [35] fansi_1.0.4 timechange_0.2.0
## [37] httr_1.4.7 abind_1.4-5
## [39] compiler_4.3.1 bit64_4.0.5
## [41] withr_2.5.0 BiocParallel_1.34.2
## [43] hexbin_1.28.3 Rttf2pt1_1.3.12
## [45] maps_3.4.1 MASS_7.3-60
## [47] DelayedArray_0.26.7 caTools_1.18.2
## [49] tools_4.3.1 extrafontdb_1.0
## [51] glue_1.6.2 reshape2_1.4.4
## [53] generics_0.1.3 gtable_0.3.4
## [55] tzdb_0.4.0 preprocessCore_1.62.1
## [57] hms_1.1.3 BiocSingular_1.16.0
## [59] ScaledMatrix_1.8.1 utf8_1.2.3
## [61] XVector_0.40.0 pillar_1.9.0
## [63] vroom_1.6.3 bit_4.0.5
## [65] deldir_1.0-9 tidyselect_1.2.0
## [67] locfit_1.5-9.8 knitr_1.44
## [69] xfun_0.40 brio_1.1.3
## [71] stringi_1.7.12 lazyeval_0.2.2
## [73] yaml_2.3.7 evaluate_0.21
## [75] codetools_0.2-19 interp_1.1-4
## [77] extrafont_0.19 BiocManager_1.30.22
## [79] cli_3.6.1 affyio_1.70.0
## [81] ash_1.0-15 munsell_0.5.0
## [83] jquerylib_0.1.4 Rcpp_1.0.11
## [85] png_0.1-8 ellipsis_0.3.2
## [87] latticeExtra_0.6-30 jpeg_0.1-10
## [89] sparseMatrixStats_1.12.2 bitops_1.0-7
## [91] viridisLite_0.4.2 scales_1.2.1
## [93] affy_1.78.2 crayon_1.5.2
## [95] rlang_1.1.1 cowplot_1.1.1
Created by Aman Zare